{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap 3 处理原始文本\n",
    "1.  如何访问文件内的文本？\n",
    "2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？\n",
    "3.  如何产生格式化的输出，并把结果保存在文件中？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 3.8 分割(Segmentation)(P121)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.8.1 句分割，断句，Sentence Segmentation(P122)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "20.250994070456922"
      ]
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "# 计算布朗语料库中每个句子的平均词数\n",
    "len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['In the wild events which were to follow this girl had no\\npart at all; he never saw her again until all his tale was over.',\n",
       " 'And yet, in some indescribable way, she kept recurring like a\\nmotive in music through all his mad adventures afterwards, and the\\nglory of her strange hair ran like a red thread through those dark\\nand ill-drawn tapestries of the night.',\n",
       " 'For what followed was so\\nimprobable, that it might well have been a dream.',\n",
       " 'When Syme went out into the starlit street, he found it for the\\nmoment empty.']"
      ]
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "source": [
    "# Punkt 句子分割器\n",
    "sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n",
    "text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')\n",
    "sents = sent_tokenizer.tokenize(text)  # 转为使用Punkt句子分割器\n",
    "sents[171:175]"
   ]
  },
  {
   "source": [
    "### 3.8.2 词分割，分词(Word Segmentation)(P123)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex3-2\n",
    "def segment(text, segs):\n",
    "    words = []\n",
    "    last = 0\n",
    "    for i in range(len(segs)):\n",
    "        if segs[i] == '1':\n",
    "            words.append(text[last:i + 1])\n",
    "            last = i + 1\n",
    "    words.append(text[last:])\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "seg1=  ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\nseg2=  ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\nseg3=  ['doyou', 'see', 'thekitt', 'y', 'see', 'thedogg', 'y', 'doyou', 'like', 'thekitt', 'y', 'like', 'thedogg', 'y']\n"
     ]
    }
   ],
   "source": [
    "text = \"doyouseethekittyseethedoggydoyoulikethekittylikethedoggy\"\n",
    "\n",
    "seg1 = \"0000000000000001000000000010000000000000000100000000000\"\n",
    "print(\"seg1= \", segment(text, seg1))\n",
    "\n",
    "seg2 = \"0100100100100001001001000010100100010010000100010010000\"\n",
    "print(\"seg2= \", segment(text, seg2))\n",
    "\n",
    "seg3 = \"0000100100000011001000000110000100010000001100010000001\"\n",
    "print(\"seg3= \", segment(text, seg3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "目标函数=  47\n"
     ]
    }
   ],
   "source": [
    "# 图3-6：目标函数=词典的大小+从词典中重构源文本所需要的信息量\n",
    "words = segment(text, seg2)\n",
    "text_size = len(words)\n",
    "lexicon_size = len(' '.join(list(set(words))))\n",
    "print(\"目标函数= \",text_size + lexicon_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P124 Ex3-3 计算存储词典和重构源文本的成本，计算目标函数，评价分词质量，得分越小越好\n",
    "def evaluate(text, segs):\n",
    "    words = segment(text, segs)\n",
    "    text_size = len(words)\n",
    "    lexicon_size = len(' '.join(list(set(words))))\n",
    "    return text_size + lexicon_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "seg1 目标函数=  63\nseg2 目标函数=  47\nseg3 目标函数=  46\n"
     ]
    }
   ],
   "source": [
    "print(\"seg1 目标函数= \",evaluate(text, seg1))\n",
    "print(\"seg2 目标函数= \",evaluate(text, seg2))\n",
    "print(\"seg3 目标函数= \",evaluate(text, seg3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# P125 Ex3-4 使用模拟退火算法的非确定性搜索；\n",
    "# 1) 一开始仅搜索短语分词；\n",
    "# 2) 然后随机扰动0和1，它们与“温度”成比例；\n",
    "# 3) 每次迭代温度都会降低，扰动边界会减少。\n",
    "\n",
    "from random import randint\n",
    "\n",
    "\n",
    "def flip(segs, pos):\n",
    "    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]\n",
    "\n",
    "\n",
    "def flip_n(segs, n):\n",
    "    for i in range(n):\n",
    "        segs = flip(segs, randint(0, len(segs) - 1))\n",
    "    return segs\n",
    "\n",
    "\n",
    "def anneal(text, segs, iterations, cooling_rate):\n",
    "    temperature = float(len(segs))\n",
    "    while temperature > 0.5:\n",
    "        best_segs, best = segs, evaluate(text, segs)\n",
    "        for i in range(iterations):\n",
    "            guess = flip_n(segs, int(round(temperature)))\n",
    "            score = evaluate(text, guess)\n",
    "            if score < best:\n",
    "                best_segs, best = guess, score\n",
    "        segs, score = best_segs, best\n",
    "        temperature = temperature / cooling_rate\n",
    "        print(evaluate(text, segs), segment(text, segs))\n",
    "    print()\n",
    "    return segs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']\n",
      "62 ['doyou', 'see', 'the', 'kittys', 'eethedo', 'ggy', 'doyou', 'like', 'the', 'kitty', 'like', 'thedo', 'ggy']\n",
      "62 ['doyou', 'see', 'the', 'kittys', 'eethedo', 'ggy', 'doyou', 'like', 'the', 'kitty', 'like', 'thedo', 'ggy']\n",
      "62 ['doyou', 'see', 'the', 'kittys', 'eethedo', 'ggy', 'doyou', 'like', 'the', 'kitty', 'like', 'thedo', 'ggy']\n",
      "58 ['doyou', 'see', 'the', 'kit', 'ty', 'see', 'thedo', 'ggy', 'doyou', 'like', 'thekitty', 'like', 'thedo', 'ggy']\n",
      "56 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'ggy', 'doyo', 'u', 'like', 'th', 'ekitty', 'like', 'thedo', 'ggy']\n",
      "54 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'g', 'g', 'y', 'doyou', 'like', 'th', 'ekitty', 'like', 'thedo', 'g', 'gy']\n",
      "51 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'g', 'gy', 'doyou', 'like', 'th', 'ekitty', 'like', 'thedo', 'g', 'gy']\n",
      "51 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'g', 'gy', 'doyou', 'like', 'th', 'ekitty', 'like', 'thedo', 'g', 'gy']\n",
      "51 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'g', 'gy', 'doyou', 'like', 'th', 'ekitty', 'like', 'thedo', 'g', 'gy']\n",
      "51 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'g', 'gy', 'doyou', 'like', 'th', 'ekitty', 'like', 'thedo', 'g', 'gy']\n",
      "48 ['doyou', 'see', 'th', 'ekitty', 'see', 'thedo', 'ggy', 'doyou', 'like', 'th', 'ekitty', 'like', 'thedo', 'ggy']\n",
      "45 ['doyou', 'see', 'thekitty', 'see', 'thedo', 'ggy', 'doyou', 'like', 'thekitty', 'like', 'thedo', 'ggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']\n",
      "\n",
      "anneal(seg1)=  0000100100000001001000000010000100010000000100010000000\n"
     ]
    }
   ],
   "source": [
    "print(\"anneal(seg1)= \", anneal(text, seg1, 5000, 1.2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "44 ['doyou', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'doyou', 'like', 'the', 'kitty', 'like', 'the', 'doggy']\n",
      "\n",
      "anneal(seg2)=  0000100100100001001001000010000100010010000100010010000\n"
     ]
    }
   ],
   "source": [
    "# 小的得分，不一定是合理的分词结果，说明评价函数存在问题\n",
    "print(\"anneal(seg2)= \", anneal(text, seg2, 5000, 1.2))"
   ]
  }
 ]
}